********************************************************************************
* clean_firm.do
* Purpose: Create the firm-level annual datasets used for propensity-score
*          matching and event-study analysis.
*
* For each year 2001-2017:
*   - Loads NALMF firm accounts (revenue, employment, payroll, assets, etc.)
*   - Computes firm age and average wage
*   - Constructs 2-digit NAICS sector codes (collapsed industry groups)
*   - Merges supplementary variables: public listing flag, demand elasticity
*     (for markup computation), worker-based payroll, and M&A characteristics
*   - Flags whether the firm is an M&A participant (MnA_firm)
*   -> Saves one file per year: firm_YYYY.dta
*
* Also builds firm_level_emp.dta: a lean firm x year panel of employment,
* used in akm_estimation.do and match_effect_estimation.do for sample selection.
*
* Inputs:
*   $data1/NALMF_YYYY.dta        - Raw firm accounts (National Accounts LMF)
*   $data2/nalmf_add_YYYY.dta    - Supplementary NALMF variables (public flag)
*   $data/theta_mc1.dta          - Demand elasticity by 2-digit industry x year
*   $data/emp_panel.dta          - T4-based firm employment panel (from worker_firm_panel.do)
*   $data/first_mna.dta          - First M&A event per firm (from clean_mna.do)
*
* Outputs:
*   $data/firm_YYYY.dta          - Firm-level matching data, one file per year
*   $data/firm_level_emp.dta     - Firm x year employment (lean panel)
********************************************************************************

//------------------------------------------------------------------------------
// SECTION 1: ANNUAL FIRM-LEVEL DATASETS (firm_YYYY.dta)
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data1/NALMF_`y', clear

	//------------------------------------------------------------------
	// Firm age: computed from the firm's incorporation date (BirthDate)
	//------------------------------------------------------------------
	gen dateinc		 = dofc(BirthDate)      // convert SIF datetime to Stata date
	gen dateinc_year = yofd(dateinc)         // extract year of incorporation
	gen age			 = year - dateinc_year   // age = current year - birth year

	//------------------------------------------------------------------
	// Average wage per worker: total T4 payroll / average employment
	// (NALMF-based measure; T4-based alternative is avg_payrolls below)
	//------------------------------------------------------------------
	gen avg_wage = T4_Payroll/PD7_AvgEmp_NonZero

	//------------------------------------------------------------------
	// 2-digit NAICS sector codes
	// NAICS codes are 6-digit; the first 2 digits identify the sector.
	// Several adjacent sectors are grouped together for consistency with
	// Statistics Canada industry classifications and to ensure sufficient
	// cell sizes in matching.
	//   31-33 -> 31 (Manufacturing)
	//   44-45 -> 44 (Retail Trade)
	//   48-49 -> 48 (Transportation and Warehousing)
	//   54, 56, 61, 62 -> 54 (Professional / Business / Education / Health)
	//------------------------------------------------------------------
	destring naics, replace
	gen 	naics2 		= int(naics/100)     // 2-digit NAICS from 6-digit code
	gen 	ind2d 		= naics2             // preserve original 2-digit code for elasticity merge
	replace naics2  	= 31 if naics2 == 32 | naics2 == 33
	replace naics2  	= 44 if naics2 == 45
	replace naics2  	= 48 if naics2 == 49
	replace naics2 		= 54 if naics2 == 56 | naics2 == 61	| naics2 ==62

	//------------------------------------------------------------------
	// Merge supplementary variables
	//------------------------------------------------------------------

	* Public listing flag (1 = publicly listed firm)
	merge m:1 entid_syn using $data2/nalmf_add_`y', keep(1 3) keepusing(public_flag) nogen

	* Demand elasticity (theta) by 2-digit industry x year, used to compute markups:
	*   Markup = theta * (sales / cost_of_sales)
	merge m:1 ind2d year using $data/theta_mc1, 	keep(1 3) nogen

	* Worker-based average payroll from T4ROE records (alternative payroll measure)
	merge m:1 entid_syn year using $data/emp_panel,	keep(1 3) keepusing(avg_payrolls) nogen

	//------------------------------------------------------------------
	// M&A characteristics: merge in first_mna to flag M&A firms and
	// record their deal year. joinby handles the m:1 relationship.
	// MnA_firm == 1 for firms that ever had an M&A event.
	//------------------------------------------------------------------
	joinby entid_syn using $data/first_mna, unmatched(master)
	gen 	MnA_firm = 1 if _merge == 3
	replace MnA_firm = 0 if _merge == 1
	drop _merge

	compress
	save $data/firm_`y', replace
}


//------------------------------------------------------------------------------
// SECTION 2: LEAN FIRM EMPLOYMENT PANEL (firm_level_emp.dta)
//
// Used in akm_estimation.do and match_effect_estimation.do to apply the
// sample restriction: firms with fewer than 5 average employees are dropped.
// Keeps only the three variables needed for that purpose.
//------------------------------------------------------------------------------
drop _all
forvalues y = 2001/2017 {

	* Append from raw NALMF; keep only identifier and employment variable
	append using $data1/NALMF_`y', keep(entid_syn year PD7_AvgEmp_NonZero) force
}
compress
save $data/firm_level_emp, replace
